Data Cleaning
There are several variables that need to be converted to factors.
These include: room_type, room_shared, room_private, and
host_is_superhost.
unique(london$room_type)
## [1] "Private room" "Entire home/apt" "Shared room"
unique(london$room_shared)
## [1] "False" "True"
unique(london$room_private)
## [1] "True" "False"
unique(london$host_is_superhost)
## [1] "False" "True"
The room_type column can converted into a 3 level factor variable
with the levels being: “Private room”, “Entire home/apt”, and “Shared
room”.
Another question is if room_shared and room_private are
independent.
mean(london$room_shared == london$room_private)
## [1] 0.4495259
Based on the data, they mean different things.
Columns multi and biz are factors stored as integers.
unique(london$multi)
## [1] 0 1
unique(london$biz)
## [1] 0 1
Switch these to True and False.
london$multi[london$multi == 0] = "False"
london$multi[london$multi == 1] = "True"
london$biz[london$biz == 0] = "False"
london$biz[london$biz == 1] = "True"
The bedrooms and person_capacity column should also be changed to a
factor.
unique(london$bedrooms)
## [1] 1 0 2 3 8 5 4
unique(london$person_capacity)
## [1] 2 3 4 6 5
Switch all to factors, also keep a copy of london with bedrooms and
person_capacity as integers
london$room_type = as.factor(london$room_type)
london$room_shared = as.factor(london$room_shared)
london$room_private = as.factor(london$room_private)
london$multi = as.factor(london$multi)
london$biz = as.factor(london$biz)
london$host_is_superhost = as.factor(london$host_is_superhost)
london_less_factors = london
london$person_capacity = as.factor(london$person_capacity)
london$bedrooms = as.factor(london$bedrooms)
head(london)
## X realSum room_type room_shared room_private person_capacity
## 1 0 121.1223 Private room False True 2
## 2 1 195.9124 Private room False True 2
## 3 2 193.3253 Private room False True 3
## 4 3 180.3899 Private room False True 2
## 5 4 405.7010 Entire home/apt False False 3
## 6 5 354.1946 Entire home/apt False False 2
## host_is_superhost multi biz cleanliness_rating guest_satisfaction_overall
## 1 False False False 6 69
## 2 False True False 10 96
## 3 False True False 10 95
## 4 False True False 9 87
## 5 False False True 7 65
## 6 False False True 9 93
## bedrooms dist metro_dist attr_index attr_index_norm rest_index
## 1 1 5.734117 0.4370940 222.8822 15.49341 470.0885
## 2 1 4.788905 1.4640505 235.3858 16.36259 530.1335
## 3 1 4.596677 0.4503062 268.9138 18.69325 548.9876
## 4 1 2.054769 0.1326705 472.3813 32.83707 1021.2711
## 5 0 4.491277 0.3541075 318.4915 22.13958 692.7754
## 6 0 4.467894 0.3507494 321.8646 22.37406 703.0686
## rest_index_norm lng lat
## 1 8.413765 -0.04975 51.52570
## 2 9.488466 -0.08475 51.54210
## 3 9.825922 -0.14585 51.54802
## 4 18.278973 -0.10611 51.52108
## 5 12.399473 -0.18797 51.49399
## 6 12.583702 -0.18805 51.49473
head(london_less_factors)
## X realSum room_type room_shared room_private person_capacity
## 1 0 121.1223 Private room False True 2
## 2 1 195.9124 Private room False True 2
## 3 2 193.3253 Private room False True 3
## 4 3 180.3899 Private room False True 2
## 5 4 405.7010 Entire home/apt False False 3
## 6 5 354.1946 Entire home/apt False False 2
## host_is_superhost multi biz cleanliness_rating guest_satisfaction_overall
## 1 False False False 6 69
## 2 False True False 10 96
## 3 False True False 10 95
## 4 False True False 9 87
## 5 False False True 7 65
## 6 False False True 9 93
## bedrooms dist metro_dist attr_index attr_index_norm rest_index
## 1 1 5.734117 0.4370940 222.8822 15.49341 470.0885
## 2 1 4.788905 1.4640505 235.3858 16.36259 530.1335
## 3 1 4.596677 0.4503062 268.9138 18.69325 548.9876
## 4 1 2.054769 0.1326705 472.3813 32.83707 1021.2711
## 5 0 4.491277 0.3541075 318.4915 22.13958 692.7754
## 6 0 4.467894 0.3507494 321.8646 22.37406 703.0686
## rest_index_norm lng lat
## 1 8.413765 -0.04975 51.52570
## 2 9.488466 -0.08475 51.54210
## 3 9.825922 -0.14585 51.54802
## 4 18.278973 -0.10611 51.52108
## 5 12.399473 -0.18797 51.49399
## 6 12.583702 -0.18805 51.49473
Variables attr_index and attr_index_norm will be collinear, as will
rest_index and rest_index_norm, so the non-normalized columns are
removed from the dataframe. Also, on initial analysis of linear models,
it was apparent that room_shared and room_private are exactly collinear
with room_type, so they will be removed. Also, remove the index X
cor(london$attr_index, london$attr_index_norm)
## [1] 1
cor(london$rest_index, london$rest_index_norm)
## [1] 1
london = subset(london, select = -c(attr_index, rest_index, room_shared, room_private, X))
london_less_factors = subset(london_less_factors, select = -c(attr_index, rest_index, room_shared, room_private, X))
See if there are any NAs:
sum(is.na(london))
## [1] 0
head(london)
## realSum room_type person_capacity host_is_superhost multi biz
## 1 121.1223 Private room 2 False False False
## 2 195.9124 Private room 2 False True False
## 3 193.3253 Private room 3 False True False
## 4 180.3899 Private room 2 False True False
## 5 405.7010 Entire home/apt 3 False False True
## 6 354.1946 Entire home/apt 2 False False True
## cleanliness_rating guest_satisfaction_overall bedrooms dist metro_dist
## 1 6 69 1 5.734117 0.4370940
## 2 10 96 1 4.788905 1.4640505
## 3 10 95 1 4.596677 0.4503062
## 4 9 87 1 2.054769 0.1326705
## 5 7 65 0 4.491277 0.3541075
## 6 9 93 0 4.467894 0.3507494
## attr_index_norm rest_index_norm lng lat
## 1 15.49341 8.413765 -0.04975 51.52570
## 2 16.36259 9.488466 -0.08475 51.54210
## 3 18.69325 9.825922 -0.14585 51.54802
## 4 32.83707 18.278973 -0.10611 51.52108
## 5 22.13958 12.399473 -0.18797 51.49399
## 6 22.37406 12.583702 -0.18805 51.49473
head(london_less_factors)
## realSum room_type person_capacity host_is_superhost multi biz
## 1 121.1223 Private room 2 False False False
## 2 195.9124 Private room 2 False True False
## 3 193.3253 Private room 3 False True False
## 4 180.3899 Private room 2 False True False
## 5 405.7010 Entire home/apt 3 False False True
## 6 354.1946 Entire home/apt 2 False False True
## cleanliness_rating guest_satisfaction_overall bedrooms dist metro_dist
## 1 6 69 1 5.734117 0.4370940
## 2 10 96 1 4.788905 1.4640505
## 3 10 95 1 4.596677 0.4503062
## 4 9 87 1 2.054769 0.1326705
## 5 7 65 0 4.491277 0.3541075
## 6 9 93 0 4.467894 0.3507494
## attr_index_norm rest_index_norm lng lat
## 1 15.49341 8.413765 -0.04975 51.52570
## 2 16.36259 9.488466 -0.08475 51.54210
## 3 18.69325 9.825922 -0.14585 51.54802
## 4 32.83707 18.278973 -0.10611 51.52108
## 5 22.13958 12.399473 -0.18797 51.49399
## 6 22.37406 12.583702 -0.18805 51.49473
Exploratory Analysis
diag = function(model, name){
resid = fitted(model) - london$realSum
rmse = sqrt(sum(resid^2)/nrow(london))
r2 = summary(model)$r.squared
coefs = nrow(summary(model)$coeff)
data.frame(model = name, rmse = rmse, r2 = r2, coefficients = coefs )
}
Main Effects Analysis
full_additive_mod = lm(realSum ~ ., data = london)
summary(full_additive_mod)
##
## Call:
## lm(formula = realSum ~ ., data = london)
##
## Residuals:
## Min 1Q Median 3Q Max
## -831.4 -85.5 -23.8 39.1 12758.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4938.8345 10038.9749 0.492 0.62276
## room_typePrivate room -162.8386 13.7048 -11.882 < 2e-16 ***
## room_typeShared room -209.0943 72.7440 -2.874 0.00406 **
## person_capacity3 27.1923 19.0931 1.424 0.15445
## person_capacity4 49.8029 17.1247 2.908 0.00365 **
## person_capacity5 49.8059 30.4829 1.634 0.10234
## person_capacity6 119.6746 28.4427 4.208 2.62e-05 ***
## host_is_superhostTrue 5.1666 14.3107 0.361 0.71809
## multiTrue -1.5363 13.0890 -0.117 0.90657
## bizTrue -4.0577 13.5098 -0.300 0.76392
## cleanliness_rating 3.5199 6.7225 0.524 0.60058
## guest_satisfaction_overall 0.8219 0.7052 1.165 0.24389
## bedrooms1 49.5374 21.4815 2.306 0.02115 *
## bedrooms2 271.2678 27.3603 9.915 < 2e-16 ***
## bedrooms3 720.8223 44.9469 16.037 < 2e-16 ***
## bedrooms4 205.7519 97.2730 2.115 0.03446 *
## bedrooms5 59.2926 264.9240 0.224 0.82291
## bedrooms8 252.2791 374.4747 0.674 0.50054
## dist 5.6117 4.2540 1.319 0.18717
## metro_dist -14.0458 6.6679 -2.106 0.03521 *
## attr_index_norm 8.1025 1.1976 6.766 1.47e-11 ***
## rest_index_norm 0.9235 1.7660 0.523 0.60105
## lng -162.7364 101.1096 -1.610 0.10757
## lat -95.4809 194.6245 -0.491 0.62374
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 373 on 5355 degrees of freedom
## Multiple R-squared: 0.2769, Adjusted R-squared: 0.2738
## F-statistic: 89.15 on 23 and 5355 DF, p-value: < 2.2e-16
linear_exp_results = diag(full_additive_mod, "full_additive_mod")
linear_exp_results
## model rmse r2 coefficients
## 1 full_additive_mod 372.2035 0.2768903 24
Fitting an additive model with no interactions to the data yields an
\(R^2 = 0.2769\). This seems quite
low.
Bedrooms and person_capacity have a large number of values, maybe a
model with them as numbers, not factors would give a better fit.
full_additive_less_factors_mod = lm(realSum ~ ., data = london_less_factors)
summary(full_additive_less_factors_mod)
##
## Call:
## lm(formula = realSum ~ ., data = london_less_factors)
##
## Residuals:
## Min 1Q Median 3Q Max
## -944.7 -96.5 -27.0 40.9 12762.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6191.8444 10152.2558 0.610 0.541955
## room_typePrivate room -185.8599 12.9509 -14.351 < 2e-16 ***
## room_typeShared room -241.7776 73.3550 -3.296 0.000987 ***
## person_capacity 40.2605 6.1731 6.522 7.58e-11 ***
## host_is_superhostTrue 5.6961 14.4673 0.394 0.693799
## multiTrue -2.9215 13.2068 -0.221 0.824937
## bizTrue -2.2367 13.6417 -0.164 0.869766
## cleanliness_rating 4.1461 6.7998 0.610 0.542057
## guest_satisfaction_overall 0.8838 0.7128 1.240 0.215047
## bedrooms 165.3640 11.3559 14.562 < 2e-16 ***
## dist 6.4223 4.2980 1.494 0.135170
## metro_dist -15.7648 6.7353 -2.341 0.019288 *
## attr_index_norm 8.0640 1.2120 6.654 3.15e-11 ***
## rest_index_norm 1.0429 1.7863 0.584 0.559363
## lng -198.8023 102.1465 -1.946 0.051677 .
## lat -123.5436 196.8196 -0.628 0.530227
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 377.6 on 5363 degrees of freedom
## Multiple R-squared: 0.2581, Adjusted R-squared: 0.256
## F-statistic: 124.4 on 15 and 5363 DF, p-value: < 2.2e-16
row = diag(full_additive_less_factors_mod, "full_additive_less_factors_mod")
row
## model rmse r2 coefficients
## 1 full_additive_less_factors_mod 377.0022 0.2581244 16
linear_exp_results = rbind(linear_exp_results, row)
Using bedrooms and person_capacity gives a slightly worse \(R^2 = 0.2581\), as compared to \(R^2 = 0.2769\) as factors.
We will use AIC to determine if any of the predictors can be
dropped.
aic_full_additive_mod = step(full_additive_mod, direction = "backward", trace = FALSE)
summary(aic_full_additive_mod)
##
## Call:
## lm(formula = realSum ~ room_type + person_capacity + guest_satisfaction_overall +
## bedrooms + dist + metro_dist + attr_index_norm + lng, data = london)
##
## Residuals:
## Min 1Q Median 3Q Max
## -827.2 -85.6 -24.2 39.8 12757.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.0360 58.4547 0.257 0.79701
## room_typePrivate room -163.0147 13.4701 -12.102 < 2e-16 ***
## room_typeShared room -209.2811 72.6692 -2.880 0.00399 **
## person_capacity3 26.0280 18.9537 1.373 0.16973
## person_capacity4 48.2794 16.9012 2.857 0.00430 **
## person_capacity5 48.2411 30.1650 1.599 0.10983
## person_capacity6 117.9900 27.8970 4.229 2.38e-05 ***
## guest_satisfaction_overall 1.1590 0.4594 2.523 0.01168 *
## bedrooms1 50.0675 21.2956 2.351 0.01876 *
## bedrooms2 272.5556 26.9564 10.111 < 2e-16 ***
## bedrooms3 721.8876 44.5407 16.207 < 2e-16 ***
## bedrooms4 206.8112 96.9888 2.132 0.03303 *
## bedrooms5 58.8775 264.6005 0.223 0.82392
## bedrooms8 250.6591 374.2151 0.670 0.50300
## dist 6.0538 4.1371 1.463 0.14344
## metro_dist -12.7656 6.2631 -2.038 0.04158 *
## attr_index_norm 8.6172 0.7616 11.314 < 2e-16 ***
## lng -189.5883 91.8427 -2.064 0.03904 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 372.9 on 5361 degrees of freedom
## Multiple R-squared: 0.2768, Adjusted R-squared: 0.2745
## F-statistic: 120.7 on 17 and 5361 DF, p-value: < 2.2e-16
row = diag(aic_full_additive_mod, "aic_full_additive_mod")
row
## model rmse r2 coefficients
## 1 aic_full_additive_mod 372.2376 0.2767578 18
linear_exp_results = rbind(linear_exp_results, row)
AIC eliminates just host_is_superhostTrue, multiTrue, bizTrue,
cleanliness_rating, rest_index_norm, and lat. The \(R^2\) value of this model left after AIC
elimination is almost the same at 0.2768 as compared to the full
additive model with an \(R^2\) of
0.2769.
We will now try BIC to see if this results in a smaller model.
bic_full_additive_mod = step(full_additive_mod, direction = "backward", k = log(nrow(london)), trace = FALSE)
summary(bic_full_additive_mod)
##
## Call:
## lm(formula = realSum ~ room_type + bedrooms + attr_index_norm +
## lng, data = london)
##
## Residuals:
## Min 1Q Median 3Q Max
## -815.9 -88.6 -22.8 42.7 12748.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 153.9668 23.6172 6.519 7.71e-11 ***
## room_typePrivate room -182.9595 12.0182 -15.224 < 2e-16 ***
## room_typeShared room -222.3489 72.6732 -3.060 0.00223 **
## bedrooms1 66.0112 20.8963 3.159 0.00159 **
## bedrooms2 329.1888 23.2151 14.180 < 2e-16 ***
## bedrooms3 811.5367 39.0830 20.764 < 2e-16 ***
## bedrooms4 280.0926 95.4377 2.935 0.00335 **
## bedrooms5 63.7316 265.0434 0.240 0.80998
## bedrooms8 220.5646 374.3695 0.589 0.55578
## attr_index_norm 8.1604 0.4403 18.536 < 2e-16 ***
## lng -231.4909 77.4179 -2.990 0.00280 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 373.6 on 5368 degrees of freedom
## Multiple R-squared: 0.273, Adjusted R-squared: 0.2716
## F-statistic: 201.6 on 10 and 5368 DF, p-value: < 2.2e-16
row = diag(bic_full_additive_mod, "bic_full_additive_model")
row
## model rmse r2 coefficients
## 1 bic_full_additive_model 373.2094 0.2729766 11
linear_exp_results = rbind(linear_exp_results, row)
BIC, as we would expect, produces a smaller model.
As there is little difference between these models, we will continue
the analysis with the full linear model. In summary, no one simple
additive model stands out. When we fit the model with bedrooms and
person_capacity as integers as in full_additive_less_factors_mod, we get
a poorer fit.
library(knitr)
kable(linear_exp_results)
| full_additive_mod |
372.2035 |
0.2768903 |
24 |
| full_additive_less_factors_mod |
377.0022 |
0.2581244 |
16 |
| aic_full_additive_mod |
372.2376 |
0.2767578 |
18 |
| bic_full_additive_model |
373.2094 |
0.2729766 |
11 |
Collinearity Analysis
Although collinearity likely will not affect prediction, it will
affect our ability to perform inference tests.
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
london_numeric = subset(london, select = c(realSum, cleanliness_rating, guest_satisfaction_overall, dist, metro_dist, attr_index_norm, rest_index_norm))
ggpairs(london_numeric)

Some of these data show collinearity and some look non-linear. In
particular, distance and metro_distance seem to be collinear.
Let us check for collinearity using the variance inflation
factor.
library(faraway)
##
## Attaching package: 'faraway'
## The following object is masked from 'package:GGally':
##
## happy
vif(full_additive_mod)
## room_typePrivate room room_typeShared room
## 1.800035 1.021575
## person_capacity3 person_capacity4
## 1.115019 1.736675
## person_capacity5 person_capacity6
## 1.329037 2.023084
## host_is_superhostTrue multiTrue
## 1.098007 1.334466
## bizTrue cleanliness_rating
## 1.621247 2.315125
## guest_satisfaction_overall bedrooms1
## 2.450057 3.309820
## bedrooms2 bedrooms3
## 3.659741 1.703254
## bedrooms4 bedrooms5
## 1.084696 1.008342
## bedrooms8 dist
## 1.007537 5.121200
## metro_dist attr_index_norm
## 2.750479 7.787592
## rest_index_norm lng
## 5.818696 1.801221
## lat
## 1.495864
vif(full_additive_mod)[unname(vif(full_additive_mod)) > 5]
## dist attr_index_norm rest_index_norm
## 5.121200 7.787592 5.818696
Per the textbook, predictors with a VIF of more than 5 are suspicious
for collinearity. The variables with VIF values more than 5 include
dist, attr_index_norm, and rest_index_norm. This makes sense as dist is
the distance to the city center, and it is likely that if this is small,
the number of attractions and restaurants (as measured by
attr_index_norm and rest_index_norm) would be high. There are more
attractions and restaurants near the city center. Also, metro_dist, the
distance to a metro station, would likely be small if dist is small.
These values are not tremendously larger than 5, so we will keep them in
model.
Transformation Analysis
What does the distribution of the room prices look like?
hist(london$realSum)

Maybe this would look better if we took the logarithm. This is a
typical transformation for prices that can vary over several orders of
magnitude.
hist(log(london$realSum), prob = TRUE)
curve(dnorm(x, mean = mean(log(london$realSum)), sd = sd(log(london$realSum))),
col = "darkorange", add = TRUE, lwd = 3)

This does look much better. The values are more spread out, and
approximate a normal distribution. Let us try to fit a model with the
log transformation of the response.
log_diag = function(model, name){
resid = exp(fitted(model)) - london$realSum
rmse = sqrt(sum(resid^2)/nrow(london))
r2 = summary(model)$r.squared
coefs = nrow(summary(model)$coeff)
data.frame(model = name, rmse = rmse, r2 = r2, coefficients = coefs )
}
full_log_model = lm(log(realSum) ~ ., data = london)
summary(full_log_model)
##
## Call:
## lm(formula = log(realSum) ~ ., data = london)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0183 -0.2246 -0.0468 0.1744 4.3885
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.5611851 9.6613901 -0.369 0.71244
## room_typePrivate room -0.5814540 0.0131894 -44.085 < 2e-16 ***
## room_typeShared room -0.7594352 0.0700080 -10.848 < 2e-16 ***
## person_capacity3 0.1038847 0.0183750 5.654 1.65e-08 ***
## person_capacity4 0.2031329 0.0164806 12.326 < 2e-16 ***
## person_capacity5 0.2565597 0.0293364 8.745 < 2e-16 ***
## person_capacity6 0.3139412 0.0273729 11.469 < 2e-16 ***
## host_is_superhostTrue 0.0302614 0.0137724 2.197 0.02805 *
## multiTrue 0.0281435 0.0125967 2.234 0.02551 *
## bizTrue 0.0396468 0.0130017 3.049 0.00230 **
## cleanliness_rating 0.0339220 0.0064697 5.243 1.64e-07 ***
## guest_satisfaction_overall 0.0002596 0.0006787 0.382 0.70215
## bedrooms1 0.1087010 0.0206736 5.258 1.51e-07 ***
## bedrooms2 0.4175121 0.0263313 15.856 < 2e-16 ***
## bedrooms3 0.7081481 0.0432564 16.371 < 2e-16 ***
## bedrooms4 0.3917943 0.0936144 4.185 2.89e-05 ***
## bedrooms5 0.1216756 0.2549597 0.477 0.63321
## bedrooms8 0.9210053 0.3603900 2.556 0.01063 *
## dist -0.0086148 0.0040940 -2.104 0.03540 *
## metro_dist -0.0403219 0.0064171 -6.284 3.57e-10 ***
## attr_index_norm 0.0100078 0.0011525 8.683 < 2e-16 ***
## rest_index_norm 0.0046483 0.0016995 2.735 0.00626 **
## lng -0.5868284 0.0973067 -6.031 1.74e-09 ***
## lat 0.1688284 0.1873043 0.901 0.36744
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.359 on 5355 degrees of freedom
## Multiple R-squared: 0.6878, Adjusted R-squared: 0.6864
## F-statistic: 512.8 on 23 and 5355 DF, p-value: < 2.2e-16
log_diag(full_log_model, "full_log_model")
## model rmse r2 coefficients
## 1 full_log_model 371.2015 0.6877577 24
The log transformation gives a much better \(R^2\). The \(R^2\) value increased from around 0.27 to
0.6878 with the log transform. The log model also has much lower
RMSE.
We will now try transformations of other variables.
hist(log(london$dist))

hist(sqrt(london$dist), prob = TRUE)
curve(dnorm(x, mean = mean(sqrt(london$dist)), sd = sd(sqrt(london$dist))),
col = "darkorange", add = TRUE, lwd = 3)

hist(log(london$guest_satisfaction_overall))

hist((london$guest_satisfaction_overall) ^ 3, prob = TRUE)
curve(dnorm(x, mean = mean((london$guest_satisfaction_overall)^3), sd = sd((london$guest)^3)),
col = "darkorange", add = TRUE, lwd = 3)

hist(london$rest_index_norm)

hist(log(london$rest_index_norm))

hist(sqrt(london$dist))

Maybe transformations of other variables may be of help.
print("Baseline R^2 value for the full additive log model with all predictors and log transformation of the response realSum")
## [1] "Baseline R^2 value for the full additive log model with all predictors and log transformation of the response realSum"
summary(full_log_model)$r.squared
## [1] 0.6877577
transform_results = log_diag(lm(log(realSum) ~ . + I(dist^2), data = london), "dist^2")
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(dist^3), data = london), "dist^3"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(sqrt(dist)), data = london), "sqrt(dist)"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(log(dist)), data = london), "log(dist)"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(1/dist), data = london), "1/dist"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(metro_dist^2), data = london), "metro_dist^2"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(metro_dist^3), data = london), "metro_dist^3"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(sqrt(metro_dist)), data = london), "sqrt(metro_dist)"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(log(metro_dist)), data = london), "log(metro_dist)"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(1/metro_dist), data = london), "1/metro_dist"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(cleanliness_rating^2), data = london), "cleanliness_rating^2"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(cleanliness_rating^3), data = london), "cleanliness_rating^3"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(sqrt(cleanliness_rating)), data = london), "sqrt(cleanliness_rating)"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(log(cleanliness_rating)), data = london), "log(cleanliness_rating)"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(1/cleanliness_rating), data = london), "1/cleanliness_rating"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(guest_satisfaction_overall^2), data = london), "guest_satisfaction_overall^2"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(guest_satisfaction_overall^3), data = london), "guest_satisfaction_overall^3"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(sqrt(guest_satisfaction_overall)), data = london), "sqrt(guest_satisfaction_overall)"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(log(guest_satisfaction_overall)), data = london), "log(guest_satisfaction_overall)"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(1/guest_satisfaction_overall), data = london), "1/guest_satisfaction_overall"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(attr_index_norm^2), data = london), "attr_index_norm^2"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(attr_index_norm^3), data = london), "attr_index_norm^3"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(sqrt(attr_index_norm)), data = london), "sqrt(attr_index_norm)"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(log(attr_index_norm)), data = london), "log(attr_index_norm)"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(1/attr_index_norm), data = london), "1/attr_index_norm"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(rest_index_norm^2), data = london), "rest_index_norm^2"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(rest_index_norm^3), data = london), "rest_index_norm^3"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(sqrt(rest_index_norm)), data = london), "sqrt(rest_index_norm)"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(log(rest_index_norm)), data = london), "log(rest_index_norm)"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(1/rest_index_norm), data = london), "1/rest_index_norm"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(lat^2), data = london), "lat^2"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(lat^3), data = london), "lat^3"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(sqrt(lat)), data = london), "sqrt(lat)"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(log(lat)), data = london), "log(lat)"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(1/lat), data = london), "1/lat"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(lng^2), data = london), "lng^2"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(lng^3), data = london), "lng^3"))
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(sqrt(lng)), data = london), "sqrt(lng)"))
## Warning in sqrt(lng): NaNs produced
## Warning in exp(fitted(model)) - london$realSum: longer object length is not a
## multiple of shorter object length
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(log(lng)), data = london), "log(lng)"))
## Warning in log(lng): NaNs produced
## Warning in log(lng): longer object length is not a multiple of shorter object
## length
transform_results = rbind(transform_results, log_diag(lm(log(realSum) ~ . + I(1/lng), data = london), "1/lng"))
transform_results
## model rmse r2 coefficients
## 1 dist^2 371.0921 0.6882736 25
## 2 dist^3 371.0958 0.6883761 25
## 3 sqrt(dist) 371.2430 0.6878211 25
## 4 log(dist) 371.3637 0.6888231 25
## 5 1/dist 371.0915 0.6884597 25
## 6 metro_dist^2 371.1040 0.6883506 25
## 7 metro_dist^3 371.1032 0.6883179 25
## 8 sqrt(metro_dist) 371.2183 0.6880650 25
## 9 log(metro_dist) 371.2493 0.6878929 25
## 10 1/metro_dist 371.2067 0.6877582 25
## 11 cleanliness_rating^2 369.8849 0.6955014 25
## 12 cleanliness_rating^3 369.7589 0.6961010 25
## 13 sqrt(cleanliness_rating) 370.1383 0.6941168 25
## 14 log(cleanliness_rating) 370.2321 0.6935538 25
## 15 1/cleanliness_rating 370.4097 0.6924280 25
## 16 guest_satisfaction_overall^2 370.0537 0.6937106 25
## 17 guest_satisfaction_overall^3 369.9344 0.6944921 25
## 18 sqrt(guest_satisfaction_overall) 370.2819 0.6922965 25
## 19 log(guest_satisfaction_overall) 370.3638 0.6918127 25
## 20 1/guest_satisfaction_overall 370.5166 0.6909465 25
## 21 attr_index_norm^2 371.1330 0.6900554 25
## 22 attr_index_norm^3 371.1424 0.6891873 25
## 23 sqrt(attr_index_norm) 371.0346 0.6917055 25
## 24 log(attr_index_norm) 370.9720 0.6921088 25
## 25 1/attr_index_norm 370.9634 0.6905069 25
## 26 rest_index_norm^2 370.6398 0.6921916 25
## 27 rest_index_norm^3 370.8914 0.6899119 25
## 28 sqrt(rest_index_norm) 370.4616 0.6956432 25
## 29 log(rest_index_norm) 370.4553 0.6962326 25
## 30 1/rest_index_norm 370.7523 0.6933897 25
## 31 lat^2 370.8874 0.6893750 25
## 32 lat^3 370.8873 0.6893756 25
## 33 sqrt(lat) 371.2015 0.6877577 24
## 34 log(lat) 371.2015 0.6877577 24
## 35 1/lat 370.8878 0.6893731 25
## 36 lng^2 370.8410 0.6896032 25
## 37 lng^3 371.0255 0.6886020 25
## 38 sqrt(lng) 481.9348 0.7524618 24
## 39 log(lng) 481.9029 0.7526732 24
## 40 1/lng 371.1954 0.6879205 25
The biggest improvement in terms of RMSE seems to be with
guest_satisfaction_overall^3.
Interaction Analysis
Next do a model with interactions, and then reduce the number of
variables with backward AIC. We will start with the full log model.
full_interact = lm(log(realSum) ~ .^2, data = london)
interact_results = log_diag(full_interact, "full_interact")
interact_results
## model rmse r2 coefficients
## 1 full_interact 349.2925 0.7335292 214
The full interaction model has a higher \(R^2\) value than any of the transformation
models above. We will have to check if there is overfitting.The
full_interact model has a significantly lower RMSE than the
transformation models.
The full interaction model is quite large. Let us use AIC and BIC to
reduce the number of predictors.
#aic_full_interact = step (full_interact, direction = "backward", trace = FALSE)
#row = log_diag(aic_full_interact, "aic_full_interact")
#row
#interact_results = rbind(aic_full_interact, "aic_full_interact")
AIC decreases the number of coefficients from 214 in the full
interaction model to 113 with a decrease in \(R^2\) from 0.7335292 to 0.7294467.
Try BIC to get a smaller model
#bic_full_interact = step (full_interact, direction = "backward", trace = FALSE, k = log(nrow(london)))
#row = log_diag(bic_full_interact, "bic_full_interact")
#row
#interact_results = rbind(bic_full_interact, "bic_full_interact")
BIC decreases the number of coefficients from 214 in the full
interaction model to 45 with a decrease in \(R^2\) from 0.7335292 to 0.7171065.
Putting It Together
Try a model with transformations and interactions. Start with the
model from the BIC elimination above and add the 1/rest_index_norm
transformation.
combined_model = lm(log(realSum) ~ room_type + person_capacity + multi +
biz + cleanliness_rating + guest_satisfaction_overall + guest_satisfaction_overall^3 + bedrooms +
dist + metro_dist + attr_index_norm + rest_index_norm + lng +
lat + room_type:bedrooms + multi:guest_satisfaction_overall +
biz:cleanliness_rating + biz:lat + cleanliness_rating:guest_satisfaction_overall +
cleanliness_rating:attr_index_norm + bedrooms:attr_index_norm +
dist:metro_dist + dist:rest_index_norm + dist:lng + dist:lat +
metro_dist:attr_index_norm + metro_dist:rest_index_norm +
attr_index_norm:rest_index_norm + rest_index_norm:lat, data = london)
combined_results = log_diag(combined_model, "combined_model")
combined_results
## model rmse r2 coefficients
## 1 combined_model 359.7816 0.7171065 45
Adding 1/rest_index_norm marginally increases \(R^2\).
Let us remove predictors to increase explanatory power. I removed
predictors with collinearity and low p-values
smaller_combined_model = lm(log(realSum) ~ room_type + person_capacity + cleanliness_rating + guest_satisfaction_overall
+ bedrooms + dist + attr_index_norm + lng +
+ multi:guest_satisfaction_overall + biz:cleanliness_rating
+ cleanliness_rating:guest_satisfaction_overall
+ dist:rest_index_norm + metro_dist:attr_index_norm, data = london)
vif(smaller_combined_model)
## room_typePrivate room
## 1.797109
## room_typeShared room
## 1.020941
## person_capacity3
## 1.114337
## person_capacity4
## 1.732889
## person_capacity5
## 1.328297
## person_capacity6
## 2.018781
## cleanliness_rating
## 11.830130
## guest_satisfaction_overall
## 12.912596
## bedrooms1
## 3.358780
## bedrooms2
## 3.691527
## bedrooms3
## 1.711460
## bedrooms4
## 1.085808
## bedrooms5
## 1.007189
## bedrooms8
## 1.006449
## dist
## 3.636849
## attr_index_norm
## 3.735503
## lng
## 1.522141
## guest_satisfaction_overall:multiTrue
## 1.313012
## cleanliness_rating:bizTrue
## 1.616445
## cleanliness_rating:guest_satisfaction_overall
## 36.445683
## dist:rest_index_norm
## 1.559669
## attr_index_norm:metro_dist
## 1.538521
row = log_diag(smaller_combined_model, "smaller_combined_model")
row
## model rmse r2 coefficients
## 1 smaller_combined_model 369.9289 0.6991838 23
combined_results = rbind(smaller_combined_model, "smaller_combined_model")
Assumption Analysis
assumpt = function(model, pcol = "gray", lcol = "dodgerblue", alpha = 0.05, plotit = TRUE, testit = TRUE){
if (plotit == TRUE){
par(mfrow=c(1,2))
plot(fitted(model), resid(model), col = pcol, pch = 20,
xlab = "Fitted", ylab = "Residuals", main = paste("Fitted vs. Residuals for ", substitute(model), sep = ""))
abline(h = 0, col = lcol, lwd = 2)
qqnorm(resid(model), main = paste("Normal Q-Q Plot for ", substitute(model), sep = ""), col = pcol, pch = 20)
qqline(resid(model), col = lcol, lwd = 2)
}
if (testit == TRUE){
#test = shapiro.test(resid(model))$p.value
#res = ifelse (test > alpha, "Fail to Reject", "Reject")
#list("p_val" = test, "decision" = res)
}
}
assumpt(combined_model)

## NULL
assumpt(smaller_combined_model)

## NULL
hist(fitted(smaller_combined_model))

hist(log(london$realSum))

It looks like the log transformation of the response is not perfect.
We will try the Box-Cox transformation.
invBoxCox = function(x, lambda)
if (lambda == 0) exp(x) else (lambda*x + 1)^(1/lambda)
library(MASS)
bc = boxcox(full_additive_mod)

(lambda = bc$x[which.max(bc$y)])
## [1] -0.2626263
lambda
## [1] -0.2626263
bc_model = lm(((realSum^lambda-1)/lambda) ~ ., data = london)
assumpt(bc_model)

## NULL
hist(((london$realSum^lambda-1)/lambda), prob = TRUE)
curve(dnorm(x, mean = mean(((london$realSum^lambda-1)/lambda)), sd = sd(((london$realSum^lambda-1)/lambda))),
col = "darkorange", add = TRUE, lwd = 3)
residuals = unname(resid(bc_model))[1:4998]
shapiro.test(residuals)
##
## Shapiro-Wilk normality test
##
## data: residuals
## W = 0.9592, p-value < 2.2e-16
resid = invBoxCox(fitted(bc_model), lambda) - london$realSum
rmse = sqrt(sum(resid^2)/nrow(london))
r2 = summary(bc_model)$r.squared
coefs = nrow(summary(bc_model)$coeff)
data.frame(model = "bc_model", rmse = rmse, r2 = r2, coefficients = coefs )
## model rmse r2 coefficients
## 1 bc_model 370.6821 0.6950315 24

Removal of Unusual Values
In the above analysis, it looks as if the smaller_combined_model
works the best. We will now eliminate unusual values.
plot(smaller_combined_model)
## Warning: not plotting observations with leverage one:
## 207




unusual_index = cooks.distance(smaller_combined_model)>(4/nrow(london))
london_no_unusual = london[!unusual_index,]
london_no_unusual = na.omit(london_no_unusual)
nrow(london) - nrow(london_no_unusual)
## [1] 248
(nrow(london) - nrow(london_no_unusual))/nrow(london)
## [1] 0.04610522
max(london_no_unusual$realSum)
## [1] 2421.27
Removing points with a cooks distance > 4/n eliminates 248 points,
or 4.6% of the total number of rows.
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
smaller_combined_no_unusual = lm(log(realSum) ~ room_type + person_capacity + cleanliness_rating + guest_satisfaction_overall
+ bedrooms + dist + attr_index_norm + lng +
+ multi:guest_satisfaction_overall + biz:cleanliness_rating
+ cleanliness_rating:guest_satisfaction_overall
+ dist:rest_index_norm + metro_dist:attr_index_norm, data = london_no_unusual)
log_diag(smaller_combined_no_unusual, "smaller_combined_no_unusual")
## Warning in exp(fitted(model)) - london$realSum: longer object length is not a
## multiple of shorter object length
## model rmse r2 coefficients
## 1 smaller_combined_no_unusual 481.1051 0.7782234 21
assumpt(smaller_combined_no_unusual)

## NULL
plot(smaller_combined_no_unusual)


shapiro.test(resid(smaller_combined_no_unusual)[1:4980])
##
## Shapiro-Wilk normality test
##
## data: resid(smaller_combined_no_unusual)[1:4980]
## W = 0.98767, p-value < 2.2e-16
bptest(smaller_combined_no_unusual)
##
## studentized Breusch-Pagan test
##
## data: smaller_combined_no_unusual
## BP = 88.623, df = 20, p-value = 1.289e-10
What if we exclude points with large leverage?
leverage_index = hatvalues(smaller_combined_no_unusual)>(2*mean(hatvalues(smaller_combined_no_unusual)))
sum(leverage_index)
## [1] 314
fewer = london[!leverage_index,]
smaller_b = lm(log(realSum) ~ room_type + person_capacity + cleanliness_rating + guest_satisfaction_overall
+ bedrooms + dist + attr_index_norm + lng +
+ multi:guest_satisfaction_overall + biz:cleanliness_rating
+ cleanliness_rating:guest_satisfaction_overall
+ dist:rest_index_norm + metro_dist:attr_index_norm, data = fewer)
log_diag(smaller_b, "smaller_combined_no_unusual_b")
## Warning in exp(fitted(model)) - london$realSum: longer object length is not a
## multiple of shorter object length
## model rmse r2 coefficients
## 1 smaller_combined_no_unusual_b 477.6786 0.6965607 23
assumpt(smaller_b)

## NULL
plot(smaller_b)
## Warning: not plotting observations with leverage one:
## 201


shapiro.test(resid(smaller_b)[1:4980])
##
## Shapiro-Wilk normality test
##
## data: resid(smaller_b)[1:4980]
## W = 0.90522, p-value < 2.2e-16
library(MASS)
smaller_combined_bc_nu = lm(((realSum^lambda-1)/lambda) ~ room_type + person_capacity + cleanliness_rating + guest_satisfaction_overall
+ bedrooms + dist + attr_index_norm + lng +
+ multi:guest_satisfaction_overall + biz:cleanliness_rating
+ cleanliness_rating:guest_satisfaction_overall
+ dist:rest_index_norm + metro_dist:attr_index_norm, data = london_no_unusual)
assumpt(smaller_combined_bc_nu)

## NULL
hist(((london$realSum^lambda-1)/lambda), prob = TRUE)
curve(dnorm(x, mean = mean(((london$realSum^lambda-1)/lambda)), sd = sd(((london$realSum^lambda-1)/lambda))),
col = "darkorange", add = TRUE, lwd = 3)
residuals = unname(resid(smaller_combined_bc_nu))[1:4998]
shapiro.test(residuals)
##
## Shapiro-Wilk normality test
##
## data: residuals
## W = 0.99507, p-value = 5.315e-12
resid = invBoxCox(fitted(smaller_combined_bc_nu), lambda) - london$realSum
## Warning in invBoxCox(fitted(smaller_combined_bc_nu), lambda) - london$realSum:
## longer object length is not a multiple of shorter object length
rmse = sqrt(sum(resid^2)/nrow(london))
r2 = summary(smaller_combined_bc_nu)$r.squared
coefs = nrow(summary(smaller_combined_bc_nu)$coeff)
data.frame(model = "smaller_combined_bc_nu", rmse = rmse, r2 = r2, coefficients = coefs )
## model rmse r2 coefficients
## 1 smaller_combined_bc_nu 485.9085 0.7638112 21

Model Evaluation
sample = sample(c(TRUE, FALSE), nrow(london_no_unusual), replace=TRUE, prob=c(0.7,0.3))
london_train = london_no_unusual[sample, ]
london_test = london_no_unusual[!sample, ]
log_predict_diag = function(true_data, fit_data, model, dataset){
resid = exp(unname(fit_data)) - unname(true_data)
rmse = sqrt(sum(resid^2)/length(fit_data))
data.frame(model = model, dataset = dataset, rmse = rmse)
}
library(lmtest)
smaller_combined_no_unusual_train = lm(log(realSum) ~ room_type + person_capacity + cleanliness_rating + guest_satisfaction_overall
+ bedrooms + dist + attr_index_norm + lng +
+ multi:guest_satisfaction_overall + biz:cleanliness_rating
+ cleanliness_rating:guest_satisfaction_overall
+ dist:rest_index_norm + metro_dist:attr_index_norm, data = london_train)
test_fit = predict(smaller_combined_no_unusual_train, london_test)
eval_results = log_predict_diag(london_test$realSum, test_fit, "smaller_combined_no_unusual_test", "test")
eval_results
## model dataset rmse
## 1 smaller_combined_no_unusual_test test 110.5443
train_fit = predict(smaller_combined_no_unusual_train, london_train)
row = log_predict_diag(london_train$realSum, train_fit, "smaller_combined_no_unusual_train", "training")
row
## model dataset rmse
## 1 smaller_combined_no_unusual_train training 113.5196
eval_results = rbind(eval_results, row)
plot(exp(unname(test_fit)) ~ unname(london_test$realSum))
abline(1,1)

full_additive_no_unusual_train = lm(log(realSum) ~ ., data = london_train)
test_fit = predict(full_additive_no_unusual_train, london_test)
row = log_predict_diag(london_test$realSum, test_fit, "full_additive_no_unusual_test", "test")
row
## model dataset rmse
## 1 full_additive_no_unusual_test test 114.2543
eval_results = rbind(eval_results, row)
train_fit = predict(full_additive_no_unusual_train, london_train)
row = log_predict_diag(london_train$realSum, train_fit, "full_additive_no_unusual_train", "training")
row
## model dataset rmse
## 1 full_additive_no_unusual_train training 115.1031
eval_results = rbind(eval_results, row)
full_interact_no_unusual_train = lm(log(realSum) ~ .^2, data = london_train)
test_fit = predict(full_interact_no_unusual_train, london_test)
## Warning in predict.lm(full_interact_no_unusual_train, london_test): prediction
## from rank-deficient fit; attr(*, "non-estim") has doubtful cases
row = log_predict_diag(london_test$realSum, test_fit, "full_interact_no_unusual_train", "test")
row
## model dataset rmse
## 1 full_interact_no_unusual_train test 114.1859
eval_results = rbind(eval_results, row)
train_fit = predict(full_interact_no_unusual_train, london_train)
## Warning in predict.lm(full_interact_no_unusual_train, london_train): prediction
## from rank-deficient fit; attr(*, "non-estim") has doubtful cases
row = log_predict_diag(london_train$realSum, train_fit, "full_interact_no_unusual_train", "training")
row
## model dataset rmse
## 1 full_interact_no_unusual_train training 102.737
eval_results = rbind(eval_results, row)
kable(eval_results)
| smaller_combined_no_unusual_test |
test |
110.5443 |
| smaller_combined_no_unusual_train |
training |
113.5196 |
| full_additive_no_unusual_test |
test |
114.2543 |
| full_additive_no_unusual_train |
training |
115.1031 |
| full_interact_no_unusual_train |
test |
114.1859 |
| full_interact_no_unusual_train |
training |
102.7370 |
As we can see from the correlation plot using the predicted values
from smaller_combined_no_unusual_train, the model seems to fall down and
underestimate the true values of higher priced rooms. Maybe there is
some intangible reason for the expense of these listings that cannot be
explained by the predictors. What happens if we exclude the listings
with realSum values more than 1000?
london_no_unusual_no_outliers = london_no_unusual[!(london_no_unusual$realSum >1000),]
hA = hist(log(london_no_unusual_no_outliers$realSum), prob = TRUE, plot = FALSE)
## Warning in hist.default(log(london_no_unusual_no_outliers$realSum), prob =
## TRUE, : argument 'probability' is not made use of
hB = hist(log(london_no_unusual$realSum), prob = TRUE, plot = FALSE)
## Warning in hist.default(log(london_no_unusual$realSum), prob = TRUE, plot =
## FALSE): argument 'probability' is not made use of
plot(hB, col = "dodgerblue")
plot(hA, col = "darkorange", add = TRUE)

smaller_combined_no_unusual_no_outliers_mod = lm(log(realSum) ~ room_type + person_capacity + cleanliness_rating + guest_satisfaction_overall
+ bedrooms + dist + log(dist) + attr_index_norm + lng +
+ multi:guest_satisfaction_overall + biz:cleanliness_rating
+ cleanliness_rating:guest_satisfaction_overall
+ dist:rest_index_norm + metro_dist:attr_index_norm, data = london_no_unusual_no_outliers)
plot(smaller_combined_no_unusual_no_outliers_mod)




test_fit = predict(smaller_combined_no_unusual_no_outliers_mod, london_no_unusual_no_outliers)
plot(exp(unname(test_fit)) ~ unname(london_no_unusual_no_outliers$realSum))
abline(1,1)

resid = exp(fitted(smaller_combined_no_unusual_no_outliers_mod)) - london_no_unusual_no_outliers$realSum
rmse = sqrt(sum(resid^2)/nrow(london_no_unusual_no_outliers))
r2 = summary(smaller_combined_no_unusual_no_outliers_mod)$r.squared
coefs = nrow(summary(smaller_combined_no_unusual_no_outliers_mod)$coeff)
data.frame(model = "smaller_combined_no_unusual_no_outliers_mod", rmse = rmse, r2 = r2, coefficients = coefs )
## model rmse r2 coefficients
## 1 smaller_combined_no_unusual_no_outliers_mod 95.62301 0.7571374 22
shapiro.test(resid(smaller_combined_no_unusual_no_outliers_mod)[1:4999])
##
## Shapiro-Wilk normality test
##
## data: resid(smaller_combined_no_unusual_no_outliers_mod)[1:4999]
## W = 0.98774, p-value < 2.2e-16